In [23]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# pd.set_option('display.max_colwidth', -1)
import matplotlib.pyplot as plt
from matplotlib import rcParams
import re
from wordcloud import WordCloud
from collections import Counter
import csv
from matplotlib import rcParams
from nltk.corpus import stopwords
import nltk
from nltk.util import ngrams
stop = stopwords.words('english')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,plot_confusion_matrix
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
In [24]:
true = pd.read_csv("input/True.csv")
false = pd.read_csv("input/Fake.csv")
true.head()
Out[24]:
title text subject date
0 As U.S. budget fight looms, Republicans flip t... WASHINGTON (Reuters) - The head of a conservat... politicsNews December 31, 2017
1 U.S. military to accept transgender recruits o... WASHINGTON (Reuters) - Transgender people will... politicsNews December 29, 2017
2 Senior U.S. Republican senator: 'Let Mr. Muell... WASHINGTON (Reuters) - The special counsel inv... politicsNews December 31, 2017
3 FBI Russia probe helped by Australian diplomat... WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews December 30, 2017
4 Trump wants Postal Service to charge 'much mor... SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews December 29, 2017
In [25]:
false.head()
Out[25]:
title text subject date
0 Donald Trump Sends Out Embarrassing New Year’... Donald Trump just couldn t wish all Americans ... News December 31, 2017
1 Drunk Bragging Trump Staffer Started Russian ... House Intelligence Committee Chairman Devin Nu... News December 31, 2017
2 Sheriff David Clarke Becomes An Internet Joke... On Friday, it was revealed that former Milwauk... News December 30, 2017
3 Trump Is So Obsessed He Even Has Obama’s Name... On Christmas day, Donald Trump announced that ... News December 29, 2017
4 Pope Francis Just Called Out Donald Trump Dur... Pope Francis used his annual Christmas Day mes... News December 25, 2017
In [26]:
true.subject.value_counts()
Out[26]:
politicsNews    11272
worldnews       10145
Name: subject, dtype: int64
In [27]:
rcParams['figure.figsize'] = 15,10
true.subject.value_counts().plot(kind="bar")
Out[27]:
<AxesSubplot:>
In [28]:
politics = true[true['subject']=="politicsNews"]
worldnews = true[true['subject']=="worldnews"]
print(politics.shape)
print(worldnews.shape)
(11272, 4)
(10145, 4)
In [29]:
politics_text_len = politics['text'].str.len()
worldnews_text_len = worldnews['text'].str.len()
print("The maximum lenght of string in Politcs news is {} words".format(max(politics_text_len)))
print("The maximum lenght of string in World news is {} words".format(max(worldnews_text_len)))
The maximum lenght of string in Politcs news is 29781 words
The maximum lenght of string in World news is 17999 words
In [30]:
def tokenizeandstopwords(text):
    tokens = nltk.word_tokenize(text)
    # taken only words (not punctuation)
    token_words = [w for w in tokens if w.isalpha()]
    meaningful_words = [w for w in token_words if not w in stop]
    joined_words = ( " ".join(meaningful_words))
    return joined_words
In [31]:
politics['text'] = politics['text'].apply(tokenizeandstopwords)
worldnews['text'] = worldnews['text'].apply(tokenizeandstopwords)
<ipython-input-31-054355159e10>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  politics['text'] = politics['text'].apply(tokenizeandstopwords)
<ipython-input-31-054355159e10>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  worldnews['text'] = worldnews['text'].apply(tokenizeandstopwords)
In [32]:
def generate_word_cloud(text):
    wordcloud = WordCloud(
        width = 3000,
        height = 2000,
        background_color = 'black').generate(str(text))
    fig = plt.figure(
        figsize = (40, 30),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
In [33]:
politics_text = politics.text.values
generate_word_cloud(politics_text)
In [34]:
false.head()
Out[34]:
title text subject date
0 Donald Trump Sends Out Embarrassing New Year’... Donald Trump just couldn t wish all Americans ... News December 31, 2017
1 Drunk Bragging Trump Staffer Started Russian ... House Intelligence Committee Chairman Devin Nu... News December 31, 2017
2 Sheriff David Clarke Becomes An Internet Joke... On Friday, it was revealed that former Milwauk... News December 30, 2017
3 Trump Is So Obsessed He Even Has Obama’s Name... On Christmas day, Donald Trump announced that ... News December 29, 2017
4 Pope Francis Just Called Out Donald Trump Dur... Pope Francis used his annual Christmas Day mes... News December 25, 2017
In [35]:
set(false.subject)
Out[35]:
{'Government News', 'Middle-east', 'News', 'US_News', 'left-news', 'politics'}
In [36]:
true.subject.value_counts()
Out[36]:
politicsNews    11272
worldnews       10145
Name: subject, dtype: int64
In [37]:
false.subject.value_counts()
Out[37]:
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64
In [38]:
Government_News = false[false['subject']=="Government News"]
Middle_east = false[false['subject']=="Middle-east"]
News = false[false['subject']=="News"]
US_News = false[false['subject']=="US_News"]
politics = false[false['subject']=="politics"]
In [39]:
Government_News['text'] = Government_News['text'].apply(tokenizeandstopwords)
Middle_east['text'] = Middle_east['text'].apply(tokenizeandstopwords)
News['text'] = News['text'].apply(tokenizeandstopwords)
US_News['text'] = US_News['text'].apply(tokenizeandstopwords)
politics['text'] = politics['text'].apply(tokenizeandstopwords)
<ipython-input-39-e07793f1b127>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Government_News['text'] = Government_News['text'].apply(tokenizeandstopwords)
<ipython-input-39-e07793f1b127>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Middle_east['text'] = Middle_east['text'].apply(tokenizeandstopwords)
<ipython-input-39-e07793f1b127>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  News['text'] = News['text'].apply(tokenizeandstopwords)
<ipython-input-39-e07793f1b127>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  US_News['text'] = US_News['text'].apply(tokenizeandstopwords)
<ipython-input-39-e07793f1b127>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  politics['text'] = politics['text'].apply(tokenizeandstopwords)
In [40]:
govertment_news_text = Government_News['text'].values
generate_word_cloud(govertment_news_text)
In [41]:
false['target'] = 'fake'
true['target'] = 'true'
news = pd.concat([false, true]).reset_index(drop = True)
news.head()
Out[41]:
title text subject date target
0 Donald Trump Sends Out Embarrassing New Year’... Donald Trump just couldn t wish all Americans ... News December 31, 2017 fake
1 Drunk Bragging Trump Staffer Started Russian ... House Intelligence Committee Chairman Devin Nu... News December 31, 2017 fake
2 Sheriff David Clarke Becomes An Internet Joke... On Friday, it was revealed that former Milwauk... News December 30, 2017 fake
3 Trump Is So Obsessed He Even Has Obama’s Name... On Christmas day, Donald Trump announced that ... News December 29, 2017 fake
4 Pope Francis Just Called Out Donald Trump Dur... Pope Francis used his annual Christmas Day mes... News December 25, 2017 fake
In [42]:
news['text'] = news['text'].apply((lambda y:re.sub("http://\S+"," ", y)))
news['text'] = news['text'].apply((lambda x:re.sub("\@", " ",x.lower())))
In [43]:
news.head()
Out[43]:
title text subject date target
0 Donald Trump Sends Out Embarrassing New Year’... donald trump just couldn t wish all americans ... News December 31, 2017 fake
1 Drunk Bragging Trump Staffer Started Russian ... house intelligence committee chairman devin nu... News December 31, 2017 fake
2 Sheriff David Clarke Becomes An Internet Joke... on friday, it was revealed that former milwauk... News December 30, 2017 fake
3 Trump Is So Obsessed He Even Has Obama’s Name... on christmas day, donald trump announced that ... News December 29, 2017 fake
4 Pope Francis Just Called Out Donald Trump Dur... pope francis used his annual christmas day mes... News December 25, 2017 fake
In [44]:
def basic_clean(text):
  """
  A simple function to clean up the data. All the words that
  are not designated as a stop word is then lemmatized after
  encoding and basic regex parsing are performed.
  """
  wnl = nltk.stem.WordNetLemmatizer()
  stopwords = nltk.corpus.stopwords.words('english')
  words = re.sub(r'[^\w\s]', '', text).split()
  return [wnl.lemmatize(word) for word in words if word not in stopwords]
In [46]:
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bima\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
Out[46]:
True
In [47]:
true_word = basic_clean(''.join(str(true['text'].tolist())))
In [48]:
x_train,x_test,y_train,y_test = train_test_split(news['text'], news.target, test_size=0.2, random_state=2020)

pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])

model = pipe.fit(x_train, y_train)
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
accuracy: 98.76%
In [49]:
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))
[[4674   66]
 [  45 4195]]
              precision    recall  f1-score   support

        fake       0.99      0.99      0.99      4740
        true       0.98      0.99      0.99      4240

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

In [50]:
plot_confusion_matrix(model,x_test,y_test)
Out[50]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1945d4b4ca0>
In [ ]: